Library
library(tidyverse) # data wrangling
library(gutenbergr) # Gutenberg project public domain text
library(quanteda) # quantitative analysis of textual data
library("quanteda.textstats") # text stats extension
library(readtext) # load textual data to corpus
library(stopwords) # alternative package to source stopwords
library(TeXCheckR) # source of the English contractions list used below
library(cleanNLP) # source of the word-frequency table used below
# Relative file paths
here::i_am("labs/Lab04_Review_Session.Rmd")
library(here)
Anne of Green Gables
# Look up Gutenberg author records whose name contains "Montgomery"
# Project Gutenberg: https://www.gutenberg.org/
# gutenbergr docs: https://docs.ropensci.org/gutenbergr/index.html
gutenberg_authors %>%
  filter(str_detect(author, "Montgomery")) %>%
  select(gutenberg_author_id, author) %>%
  head(10)
## # A tibble: 10 × 2
## gutenberg_author_id author
## <int> <chr>
## 1 36 Montgomery, L. M. (Lucy Maud)
## 2 2113 Skinner, Charles M. (Charles Montgomery)
## 3 3290 Beck, James M. (James Montgomery)
## 4 5306 Bird, Robert Montgomery
## 5 7342 Montgomery, D. H. (David Henry)
## 6 8392 Montgomery, Frances Trego
## 7 9003 Ward, John Montgomery
## 8 25940 Flagg, James Montgomery
## 9 33486 Montgomery, H. B. (Helen Barrett)
## 10 34150 Montgomery, Rutherford G. (Rutherford George)
# List the works available for author id 36 (L. M. Montgomery in the
# author lookup above)
gutenberg_works(gutenberg_author_id == 36) %>%
  select(gutenberg_id, title, author)
## # A tibble: 17 × 3
## gutenberg_id title author
## <int> <chr> <chr>
## 1 45 Anne of Green Gables Montgomery, L.…
## 2 47 Anne of Avonlea Montgomery, L.…
## 3 51 Anne of the Island Montgomery, L.…
## 4 316 The Golden Road Montgomery, L.…
## 5 544 Anne's House of Dreams Montgomery, L.…
## 6 1354 Chronicles of Avonlea Montgomery, L.…
## 7 3796 Rilla of Ingleside Montgomery, L.…
## 8 5340 Further Chronicles of Avonlea Montgomery, L.…
## 9 5341 Kilmeny of the Orchard Montgomery, L.…
## 10 5342 The Story Girl Montgomery, L.…
## 11 24873 Lucy Maud Montgomery Short Stories, 1896 to 1901 Montgomery, L.…
## 12 24874 Lucy Maud Montgomery Short Stories, 1902 to 1903 Montgomery, L.…
## 13 24875 Lucy Maud Montgomery Short Stories, 1904 Montgomery, L.…
## 14 24876 Lucy Maud Montgomery Short Stories, 1905 to 1906 Montgomery, L.…
## 15 24877 Lucy Maud Montgomery Short Stories, 1907 to 1908 Montgomery, L.…
## 16 24878 Lucy Maud Montgomery Short Stories, 1909 to 1922 Montgomery, L.…
## 17 67979 The Blue Castle: a novel Montgomery, L.…
# Narrow the same listing to works with "Anne" in the title
gutenberg_works(gutenberg_author_id == 36) %>%
  select(gutenberg_id, title, author) %>%
  filter(str_detect(title, "Anne"))
## # A tibble: 4 × 3
## gutenberg_id title author
## <int> <chr> <chr>
## 1 45 Anne of Green Gables Montgomery, L. M. (Lucy Maud)
## 2 47 Anne of Avonlea Montgomery, L. M. (Lucy Maud)
## 3 51 Anne of the Island Montgomery, L. M. (Lucy Maud)
## 4 544 Anne's House of Dreams Montgomery, L. M. (Lucy Maud)
# Download the text of every "Anne" title, carrying the title along as a
# metadata column next to each line of text
anne_books <- gutenberg_works(gutenberg_author_id == 36) %>%
  filter(str_detect(title, "Anne")) %>%
  gutenberg_download(meta_fields = "title")
anne_books
## # A tibble: 38,428 × 3
## gutenberg_id text title
## <int> <chr> <chr>
## 1 45 "ANNE OF GREEN GABLES" Anne o…
## 2 45 "" Anne o…
## 3 45 "By Lucy Maud Montgomery" Anne o…
## 4 45 "" Anne o…
## 5 45 "" Anne o…
## 6 45 "" Anne o…
## 7 45 "Table of Contents" Anne o…
## 8 45 "" Anne o…
## 9 45 " CHAPTER I Mrs. Rachel Lynde Is Surprised" Anne o…
## 10 45 " CHAPTER II Matthew Cuthbert Is Surprised" Anne o…
## # ℹ 38,418 more rows
# Collapse each book's lines into a single string per title, then replace
# the typographic apostrophe (’) with a plain ASCII apostrophe (')
# (presumably so contractions match the ASCII-apostrophe stopword entries)
anne_books_text <- anne_books %>%
  dplyr::group_by(title) %>%
  dplyr::summarise(text_whole = paste(text, collapse = " ")) %>%
  dplyr::mutate(text_whole = gsub("’", "'", text_whole))
# View data
anne_books_text
## # A tibble: 4 × 2
## title text_whole
## <chr> <chr>
## 1 Anne of Avonlea [Illustration] ANNE OF AVONLEA by Lucy Maud Montg…
## 2 Anne of Green Gables ANNE OF GREEN GABLES By Lucy Maud Montgomery Table…
## 3 Anne of the Island Anne of the Island by Lucy Maud Montgomery All pr…
## 4 Anne's House of Dreams Anne's House of Dreams by Lucy Maud Montgomery “T…
# Build a quanteda corpus: one document per book, named by its title
anne_corpus <- anne_books_text %>%
  corpus(docid_field = "title", text_field = "text_whole")
anne_corpus
## Corpus consisting of 4 documents.
## Anne of Avonlea :
## "[Illustration] ANNE OF AVONLEA by Lucy Maud Montgomery ..."
##
## Anne of Green Gables :
## "ANNE OF GREEN GABLES By Lucy Maud Montgomery Table of Co..."
##
## Anne of the Island :
## "Anne of the Island by Lucy Maud Montgomery All precious..."
##
## Anne's House of Dreams :
## "Anne's House of Dreams by Lucy Maud Montgomery “To Laur..."
# Subset the corpus to its first document
anne_corpus[1]
## Corpus consisting of 1 document.
## Anne of Avonlea :
## "[Illustration] ANNE OF AVONLEA by Lucy Maud Montgomery ..."
# Summarise the corpus: per-document type, token and sentence counts
anne_corpus %>% summary()
## Corpus consisting of 4 documents, showing 4 documents:
##
## Text Types Tokens Sentences
## Anne of Avonlea 8358 106707 5671
## Anne of Green Gables 8453 121673 6846
## Anne of the Island 8831 92323 5921
## Anne's House of Dreams 7816 97515 5709
# Tokenise the corpus into words, dropping punctuation
anne_corpus_tokens_orig <- anne_corpus %>%
  tokens(what = "word", remove_punct = TRUE)
head(anne_corpus_tokens_orig)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
## [1] "Illustration" "ANNE" "OF" "AVONLEA" "by"
## [6] "Lucy" "Maud" "Montgomery" "To" "my"
## [11] "former" "teacher"
## [ ... and 88,798 more ]
##
## Anne of Green Gables :
## [1] "ANNE" "OF" "GREEN" "GABLES" "By"
## [6] "Lucy" "Maud" "Montgomery" "Table" "of"
## [11] "Contents" "CHAPTER"
## [ ... and 103,149 more ]
##
## Anne of the Island :
## [1] "Anne" "of" "the" "Island" "by"
## [6] "Lucy" "Maud" "Montgomery" "All" "precious"
## [11] "things" "discovered"
## [ ... and 76,612 more ]
##
## Anne's House of Dreams :
## [1] "Anne's" "House" "of" "Dreams" "by"
## [6] "Lucy" "Maud" "Montgomery" "To" "Laura"
## [11] "in" "memory"
## [ ... and 80,421 more ]
# Convert tokens to lower case. keep_acronyms = TRUE leaves all-caps tokens
# untouched, which is why heading words such as "ANNE" and "CHAPTER" are
# still upper case in the printed result.
# NOTE(review): these are headings rather than acronyms — confirm that
# keeping them upper case is intended.
anne_corpus_tokens_orig <- anne_corpus_tokens_orig %>%
  tokens_tolower(keep_acronyms = TRUE)
head(anne_corpus_tokens_orig)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
## [1] "illustration" "ANNE" "OF" "AVONLEA" "by"
## [6] "lucy" "maud" "montgomery" "to" "my"
## [11] "former" "teacher"
## [ ... and 88,798 more ]
##
## Anne of Green Gables :
## [1] "ANNE" "OF" "GREEN" "GABLES" "by"
## [6] "lucy" "maud" "montgomery" "table" "of"
## [11] "contents" "CHAPTER"
## [ ... and 103,149 more ]
##
## Anne of the Island :
## [1] "anne" "of" "the" "island" "by"
## [6] "lucy" "maud" "montgomery" "all" "precious"
## [11] "things" "discovered"
## [ ... and 76,612 more ]
##
## Anne's House of Dreams :
## [1] "anne's" "house" "of" "dreams" "by"
## [6] "lucy" "maud" "montgomery" "to" "laura"
## [11] "in" "memory"
## [ ... and 80,421 more ]
# Drop English stopwords plus the literal token "nbsp" (presumably leftover
# non-breaking-space markup). padding = FALSE removes the tokens outright
# instead of leaving empty placeholders, so words that were separated by a
# stopword become neighbours afterwards.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
anne_corpus_tokens <- tokens_remove(
  anne_corpus_tokens_orig,
  c(stopwords("english"), "nbsp"),
  padding = FALSE
)
head(anne_corpus_tokens)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
## [1] "illustration" "ANNE" "AVONLEA" "lucy" "maud"
## [6] "montgomery" "former" "teacher" "HATTIE" "GORDON"
## [11] "SMITH" "grateful"
## [ ... and 43,999 more ]
##
## Anne of Green Gables :
## [1] "ANNE" "GREEN" "GABLES" "lucy" "maud"
## [6] "montgomery" "table" "contents" "CHAPTER" "mrs"
## [11] "rachel" "lynde"
## [ ... and 49,929 more ]
##
## Anne of the Island :
## [1] "anne" "island" "lucy" "maud" "montgomery"
## [6] "precious" "things" "discovered" "late" "seek"
## [11] "issue" "forth"
## [ ... and 38,545 more ]
##
## Anne's House of Dreams :
## [1] "anne's" "house" "dreams" "lucy" "maud"
## [6] "montgomery" "laura" "memory" "olden" "time"
## [11] "CONTENTS" "chapter"
## [ ... and 39,065 more ]
# Reduce every remaining token to its word stem (e.g. "illustration"
# becomes "illustr" in the printed result)
anne_corpus_tokens <- anne_corpus_tokens %>%
  tokens_wordstem()
anne_corpus_tokens
## Tokens consisting of 4 documents.
## Anne of Avonlea :
## [1] "illustr" "ANNE" "AVONLEA" "luci" "maud"
## [6] "montgomeri" "former" "teacher" "HATTIE" "GORDON"
## [11] "SMITH" "grate"
## [ ... and 43,999 more ]
##
## Anne of Green Gables :
## [1] "ANNE" "GREEN" "GABLES" "luci" "maud"
## [6] "montgomeri" "tabl" "content" "CHAPTER" "mrs"
## [11] "rachel" "lynd"
## [ ... and 49,929 more ]
##
## Anne of the Island :
## [1] "ann" "island" "luci" "maud" "montgomeri"
## [6] "precious" "thing" "discov" "late" "seek"
## [11] "issu" "forth"
## [ ... and 38,545 more ]
##
## Anne's House of Dreams :
## [1] "ann" "hous" "dream" "luci" "maud"
## [6] "montgomeri" "laura" "memori" "olden" "time"
## [11] "CONTENTS" "chapter"
## [ ... and 39,065 more ]
# Find frequently co-occurring word pairs (typically compound words or
# names). Stopwords were removed earlier without padding, so a pair may
# bridge words that a stopword separated in the original text.
anne_corpus_ngram <- anne_corpus_tokens %>%
  tokens_ngrams(n = 2) %>%
  dfm()
textstat_frequency(anne_corpus_ngram, n = 50)
## feature frequency rank docfreq group
## 1 said_ann 550 1 4 all
## 2 mrs_lynd 314 2 4 all
## 3 captain_jim 295 3 1 all
## 4 miss_cornelia 224 4 1 all
## 5 green_gabl 198 5 4 all
## 6 said_marilla 197 6 4 all
## 7 mr_harrison 178 7 3 all
## 8 mrs_rachel 164 8 4 all
## 9 miss_lavendar 160 9 3 all
## 10 said_diana 105 10 4 all
## 11 gilbert_blyth 90 11 4 all
## 12 mrs_allan 89 12 4 all
## 13 littl_girl 88 13 4 all
## 14 said_mrs 84 14 4 all
## 15 four_wind 84 14 1 all
## 16 ann_shirley 82 16 4 all
## 17 ann_said 78 17 4 all
## 18 aunt_jamesina 76 18 2 all
## 19 charlotta_fourth 75 19 3 all
## 20 oh_marilla 74 20 3 all
## 21 miss_staci 74 20 3 all
## 22 said_miss 74 20 4 all
## 23 ask_ann 73 23 4 all
## 24 rubi_gilli 71 24 4 all
## 25 mistress_blyth 70 25 1 all
## 26 miss_shirley 69 26 4 all
## 27 littl_hous 69 26 4 all
## 28 ever_sinc 67 28 4 all
## 29 lynd_say 66 29 4 all
## 30 come_back 65 30 4 all
## 31 year_ago 63 31 4 all
## 32 look_like 59 32 4 all
## 33 ann_look 59 32 4 all
## 34 patti_place 58 34 2 all
## 35 want_know 57 35 4 all
## 36 said_davi 57 35 3 all
## 37 well_now 57 35 3 all
## 38 said_gilbert 56 38 4 all
## 39 ann_felt 54 39 4 all
## 40 oh_ann 54 39 4 all
## 41 mrs_doctor 54 39 1 all
## 42 just_like 53 42 4 all
## 43 last_night 49 43 4 all
## 44 mrs_barri 49 43 2 all
## 45 ann_diana 48 45 3 all
## 46 rachel_lynd 47 46 4 all
## 47 one_day 47 46 4 all
## 48 ann_gilbert 47 46 4 all
## 49 owen_ford 47 46 1 all
## 50 mr_phillip 46 50 2 all
# Find frequently co-occurring word triples. As with the pairs, a triple
# may bridge positions where stopwords were removed.
anne_corpus_ngram3 <- anne_corpus_tokens %>%
  tokens_ngrams(n = 3) %>%
  dfm()
textstat_frequency(anne_corpus_ngram3, n = 50)
## feature frequency rank docfreq group
## 1 mrs_lynd_say 63 1 3 all
## 2 said_miss_cornelia 45 2 1 all
## 3 miss_shirley_ma'am 43 3 3 all
## 4 mrs_rachel_lynd 38 4 4 all
## 5 said_captain_jim 38 4 1 all
## 6 mrs_doctor_dear 38 4 1 all
## 7 said_mrs_rachel 29 7 4 all
## 8 lake_shine_water 28 8 4 all
## 9 said_aunt_jamesina 24 9 1 all
## 10 said_mrs_lynd 17 10 4 all
## 11 said_miss_lavendar 16 11 1 all
## 12 ann_want_know 15 12 2 all
## 13 four_wind_harbor 15 12 1 all
## 14 said_mr_harrison 14 14 2 all
## 15 mrs_lynd_said 14 14 3 all
## 16 mrs_harmon_andrew 14 14 3 all
## 17 just_like_man 13 17 1 all
## 18 jog_along_black 12 18 1 all
## 19 along_black_mare 12 18 1 all
## 20 race_know_joseph 12 18 1 all
## 21 littl_hous_dream 12 18 1 all
## 22 princ_edward_island 11 22 3 all
## 23 shall_never_forget 11 22 3 all
## 24 never_said_word 11 22 4 all
## 25 ann_said_marilla 11 22 4 all
## 26 come_green_gabl 11 22 4 all
## 27 glen_st_mari 11 22 1 all
## 28 said_ann_dreamili 10 28 4 all
## 29 old_st_john 10 28 1 all
## 30 littl_stone_hous 9 30 1 all
## 31 back_green_gabl 9 30 3 all
## 32 drew_long_breath 9 30 2 all
## 33 davi_said_ann 9 30 2 all
## 34 ann_shook_head 9 30 4 all
## 35 oh_miss_shirley 9 30 1 all
## 36 well_now_dunno 9 30 1 all
## 37 miss_cornelia_said 9 30 1 all
## 38 mr_mrs_allan 8 38 4 all
## 39 green_gabl_ann 8 38 3 all
## 40 said_ann_thought 8 38 3 all
## 41 mr_harmon_andrew 8 38 2 all
## 42 said_ann_soft 8 38 4 all
## 43 _you__know_teacher 8 38 1 all
## 44 mrs_morgan_heroin 8 38 1 all
## 45 allan_miss_staci 8 38 2 all
## 46 young_mari_joe 8 38 2 all
## 47 said_ann_decid 8 38 3 all
## 48 look_much_like 8 38 4 all
## 49 green_gabl_said 8 38 3 all
## 50 oh_said_ann 8 38 2 all
# Top 50 features by overall frequency. The tokens were stemmed earlier;
# dfm_wordstem() stems once more at the feature level.
anne_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(50)
## ann said marilla go like one mrs never just littl
## 3984 2331 1307 1302 1296 1098 1072 1060 1033 1031
## think know say look come miss well diana thing oh
## 1019 977 927 915 859 817 806 795 767 740
## see old good time gilbert now get even much girl
## 705 692 688 685 656 637 611 610 606 594
## thought want can day ever love home mr came make
## 593 587 586 558 544 541 540 529 520 509
## went must eye alway year tell back seem ask feel
## 501 491 484 483 481 472 464 463 456 454
Stopwords
# Size of quanteda's default English stopword list
quanteda::stopwords("english") %>% length()
## [1] 175
# Size of the SMART stopword list
stopwords::stopwords(source = "smart") %>% length()
## [1] 571
# Size of the Snowball stopword list
stopwords::stopwords(source = "snowball") %>% length()
## [1] 175
# Size of the stopwords-iso list
stopwords::stopwords(source = "stopwords-iso") %>% length()
## [1] 1298
# First 100 entries of quanteda's default English stopword list
head(quanteda::stopwords("english"), 100)
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
# First 100 entries of the SMART stopword list
head(stopwords::stopwords(source = "smart"), 100)
## [1] "a" "a's" "able" "about" "above"
## [6] "according" "accordingly" "across" "actually" "after"
## [11] "afterwards" "again" "against" "ain't" "all"
## [16] "allow" "allows" "almost" "alone" "along"
## [21] "already" "also" "although" "always" "am"
## [26] "among" "amongst" "an" "and" "another"
## [31] "any" "anybody" "anyhow" "anyone" "anything"
## [36] "anyway" "anyways" "anywhere" "apart" "appear"
## [41] "appreciate" "appropriate" "are" "aren't" "around"
## [46] "as" "aside" "ask" "asking" "associated"
## [51] "at" "available" "away" "awfully" "b"
## [56] "be" "became" "because" "become" "becomes"
## [61] "becoming" "been" "before" "beforehand" "behind"
## [66] "being" "believe" "below" "beside" "besides"
## [71] "best" "better" "between" "beyond" "both"
## [76] "brief" "but" "by" "c" "c'mon"
## [81] "c's" "came" "can" "can't" "cannot"
## [86] "cant" "cause" "causes" "certain" "certainly"
## [91] "changes" "clearly" "co" "com" "come"
## [96] "comes" "concerning" "consequently" "consider" "considering"
# First 100 entries of the Snowball stopword list
head(stopwords::stopwords(source = "snowball"), 100)
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
# First 100 entries of the stopwords-iso list
head(stopwords::stopwords(source = "stopwords-iso"), 100)
## [1] "'ll" "'tis" "'twas" "'ve"
## [5] "10" "39" "a" "a's"
## [9] "able" "ableabout" "about" "above"
## [13] "abroad" "abst" "accordance" "according"
## [17] "accordingly" "across" "act" "actually"
## [21] "ad" "added" "adj" "adopted"
## [25] "ae" "af" "affected" "affecting"
## [29] "affects" "after" "afterwards" "ag"
## [33] "again" "against" "ago" "ah"
## [37] "ahead" "ai" "ain't" "aint"
## [41] "al" "all" "allow" "allows"
## [45] "almost" "alone" "along" "alongside"
## [49] "already" "also" "although" "always"
## [53] "am" "amid" "amidst" "among"
## [57] "amongst" "amoungst" "amount" "an"
## [61] "and" "announce" "another" "any"
## [65] "anybody" "anyhow" "anymore" "anyone"
## [69] "anything" "anyway" "anyways" "anywhere"
## [73] "ao" "apart" "apparently" "appear"
## [77] "appreciate" "appropriate" "approximately" "aq"
## [81] "ar" "are" "area" "areas"
## [85] "aren" "aren't" "arent" "arise"
## [89] "around" "arpa" "as" "aside"
## [93] "ask" "asked" "asking" "asks"
## [97] "associated" "at" "au" "auth"
# Lower-cased copy of TeXCheckR's list of valid English contractions
contractions <- tolower(TeXCheckR::valid_English_contractions)
contractions
## [1] "ain't" "aren't" "can't" "could've"
## [5] "couldn't" "couldn't've" "didn't" "doesn't"
## [9] "don't" "hadn't" "hadn't've" "hasn't"
## [13] "haven't" "he'd" "he'd've" "he'll"
## [17] "he's" "how'd" "how'll" "how's"
## [21] "i'd" "i'd've" "i'll" "i'm"
## [25] "i've" "isn't" "it'd" "it'd've"
## [29] "it'll" "it's" "let's" "ma'am"
## [33] "mightn't" "mightn't've" "might've" "mustn't"
## [37] "must've" "needn't" "not've" "o'clock"
## [41] "oughtn't" "'ow's'at" "shan't" "she'd"
## [45] "she'd've" "she'll" "she's" "should've"
## [49] "shouldn't" "shouldn't've" "somebody'd" "somebody'd've"
## [53] "somebody'll" "somebody's" "someone'd" "someone'd've"
## [57] "someone'll" "someone's" "something'd" "something'd've"
## [61] "something'll" "something's" "that'll" "that's"
## [65] "there'd" "there'd've" "there're" "there's"
## [69] "they'd" "they'd've" "they'll" "they're"
## [73] "they've" "'twas" "wasn't" "we'd"
## [77] "we'd've" "we'll" "we're" "we've"
## [81] "weren't" "what'll" "what're" "what's"
## [85] "what've" "when's" "where'd" "where's"
## [89] "where've" "who'd" "who'd've" "who'll"
## [93] "who're" "who's" "who've" "why'll"
## [97] "why're" "why's" "won't" "would've"
## [101] "wouldn't" "wouldn't've" "y'all" "y'all'll"
## [105] "y'all'd've" "you'd" "you'd've" "you'll"
## [109] "you're" "you've"
# Keep a handle on cleanNLP's word-frequency table and report how many
# words it holds. The count reads from freq_words so the variable is
# actually used rather than assigned and ignored.
freq_words <- cleanNLP::word_frequency
length(freq_words$word)
## [1] 150000
# First 100 words of the word-frequency table (freq_words is the copy of
# cleanNLP::word_frequency assigned above)
freq_words$word %>% head(100)
## [1] "the" "of" "and" "to" "a"
## [6] "in" "for" "is" "on" "that"
## [11] "by" "this" "with" "i" "you"
## [16] "it" "not" "or" "be" "are"
## [21] "from" "at" "as" "your" "all"
## [26] "have" "new" "more" "an" "was"
## [31] "we" "will" "home" "can" "us"
## [36] "about" "if" "page" "my" "has"
## [41] "search" "free" "but" "our" "one"
## [46] "other" "do" "no" "information" "time"
## [51] "they" "site" "he" "up" "may"
## [56] "what" "which" "their" "news" "out"
## [61] "use" "any" "there" "see" "only"
## [66] "so" "his" "when" "contact" "here"
## [71] "business" "who" "web" "also" "now"
## [76] "help" "get" "pm" "view" "online"
## [81] "c" "e" "first" "am" "been"
## [86] "would" "how" "were" "me" "s"
## [91] "services" "some" "these" "click" "its"
## [96] "like" "service" "x" "than" "find"
Anne of Green Gables Alternate Stopwords
# Same stopword removal as before, but using the larger stopwords-iso list
# (plus the literal token "nbsp"). padding = FALSE removes the tokens
# outright instead of leaving empty placeholders.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
anne_corpus_tokens_alt <- tokens_remove(
  anne_corpus_tokens_orig,
  c(stopwords::stopwords(source = "stopwords-iso"), "nbsp"),
  padding = FALSE
)
head(anne_corpus_tokens_alt)
## Tokens consisting of 4 documents.
## Anne of Avonlea :
## [1] "illustration" "ANNE" "AVONLEA" "lucy" "maud"
## [6] "montgomery" "teacher" "HATTIE" "GORDON" "SMITH"
## [11] "grateful" "remembrance"
## [ ... and 28,674 more ]
##
## Anne of Green Gables :
## [1] "ANNE" "GREEN" "GABLES" "lucy" "maud"
## [6] "montgomery" "table" "contents" "CHAPTER" "rachel"
## [11] "lynde" "surprised"
## [ ... and 32,589 more ]
##
## Anne of the Island :
## [1] "anne" "island" "lucy" "maud" "montgomery"
## [6] "precious" "discovered" "late" "seek" "issue"
## [11] "love" "sequel"
## [ ... and 25,699 more ]
##
## Anne's House of Dreams :
## [1] "anne's" "house" "dreams" "lucy" "maud"
## [6] "montgomery" "laura" "memory" "olden" "time"
## [11] "CONTENTS" "chapter"
## [ ... and 25,287 more ]
# Reduce the alternate token set to word stems
anne_corpus_tokens_alt <- anne_corpus_tokens_alt %>%
  tokens_wordstem()
anne_corpus_tokens_alt
## Tokens consisting of 4 documents.
## Anne of Avonlea :
## [1] "illustr" "ANNE" "AVONLEA" "luci" "maud"
## [6] "montgomeri" "teacher" "HATTIE" "GORDON" "SMITH"
## [11] "grate" "remembr"
## [ ... and 28,674 more ]
##
## Anne of Green Gables :
## [1] "ANNE" "GREEN" "GABLES" "luci" "maud"
## [6] "montgomeri" "tabl" "content" "CHAPTER" "rachel"
## [11] "lynd" "surpris"
## [ ... and 32,589 more ]
##
## Anne of the Island :
## [1] "ann" "island" "luci" "maud" "montgomeri"
## [6] "precious" "discov" "late" "seek" "issu"
## [11] "love" "sequel"
## [ ... and 25,699 more ]
##
## Anne's House of Dreams :
## [1] "ann" "hous" "dream" "luci" "maud"
## [6] "montgomeri" "laura" "memori" "olden" "time"
## [11] "CONTENTS" "chapter"
## [ ... and 25,287 more ]
# Find frequently co-occurring word pairs in the alternate token set
# (typically compound words or names). Stopwords were removed without
# padding, so a pair may bridge words a stopword originally separated.
anne_corpus_ngram_alt <- anne_corpus_tokens_alt %>%
  tokens_ngrams(n = 2) %>%
  dfm()
textstat_frequency(anne_corpus_ngram_alt, n = 50)
## feature frequency rank docfreq group
## 1 captain_jim 295 1 1 all
## 2 green_gabl 198 2 4 all
## 3 gilbert_blyth 90 3 4 all
## 4 ann_shirley 82 4 4 all
## 5 aunt_jamesina 76 5 2 all
## 6 charlotta_fourth 75 6 3 all
## 7 rubi_gilli 71 7 4 all
## 8 mistress_blyth 70 8 1 all
## 9 ann_gilbert 67 9 4 all
## 10 marilla_ann 66 10 4 all
## 11 ann_diana 66 10 4 all
## 12 ann_ann 61 12 4 all
## 13 ann_marilla 50 13 4 all
## 14 rachel_lynd 47 14 4 all
## 15 owen_ford 47 14 1 all
## 16 white_sand 45 16 3 all
## 17 josi_pye 45 16 2 all
## 18 cri_ann 44 18 4 all
## 19 shirley_ma'am 43 19 3 all
## 20 dick_moor 43 19 1 all
## 21 jane_andrew 42 21 4 all
## 22 ann_look 40 22 4 all
## 23 east_gabl 40 22 3 all
## 24 charli_sloan 40 22 4 all
## 25 mari_joe 39 25 2 all
## 26 hous_dream 38 26 3 all
## 27 shook_head 38 26 4 all
## 28 haunt_wood 38 26 4 all
## 29 diana_ann 36 29 4 all
## 30 time_ann 35 30 4 all
## 31 ann_laugh 35 30 4 all
## 32 kindr_spirit 33 32 4 all
## 33 laugh_ann 33 32 4 all
## 34 moodi_spurgeon 31 34 3 all
## 35 sunday_school 31 34 3 all
## 36 ann_deari 31 34 1 all
## 37 echo_lodg 30 37 3 all
## 38 told_ann 30 37 4 all
## 39 stone_hous 29 39 2 all
## 40 ann_feel 29 39 4 all
## 41 paul_irv 28 41 2 all
## 42 day_ann 28 41 4 all
## 43 lover_lane 28 41 4 all
## 44 lake_shine 28 41 4 all
## 45 shine_water 28 41 4 all
## 46 gilbert_ann 27 46 4 all
## 47 matthew_cuthbert 27 46 2 all
## 48 answer_ann 26 48 4 all
## 49 ann_lesli 26 48 1 all
## 50 sigh_ann 25 50 4 all
# Find frequently co-occurring word triples in the alternate token set
anne_corpus_ngram3_alt <- anne_corpus_tokens_alt %>%
  tokens_ngrams(n = 3) %>%
  dfm()
textstat_frequency(anne_corpus_ngram3_alt, n = 50)
## feature frequency rank docfreq group
## 1 lake_shine_water 28 1 4 all
## 2 ann_green_gabl 13 2 4 all
## 3 gilbert_blyth_ann 12 3 3 all
## 4 green_gabl_ann 12 3 4 all
## 5 jog_black_mare 12 3 1 all
## 6 princ_edward_island 11 6 3 all
## 7 ann_shook_head 10 7 4 all
## 8 ann_captain_jim 9 8 1 all
## 9 ann_clasp_hand 7 9 2 all
## 10 wrong_upper_stori 7 9 1 all
## 11 green_gabl_even 7 9 3 all
## 12 stay_green_gabl 7 9 2 all
## 13 green_gabl_marilla 7 9 3 all
## 14 moodi_spurgeon_macpherson 7 9 2 all
## 15 mistress_blyth_captain 7 9 1 all
## 16 blyth_captain_jim 7 9 1 all
## 17 captain_jim_ann 7 9 1 all
## 18 white_sand_hotel 6 18 2 all
## 19 ann_gilbert_blyth 6 18 2 all
## 20 gilbert_captain_jim 6 18 1 all
## 21 belong_race_joseph 6 18 1 all
## 22 john_henri_carter 5 22 1 all
## 23 blyth_ann_shirley 5 22 3 all
## 24 arriv_green_gabl 5 22 3 all
## 25 ann_told_marilla 5 22 2 all
## 26 return_green_gabl 5 22 3 all
## 27 green_gabl_day 5 22 3 all
## 28 sit_porch_step 5 22 2 all
## 29 hear_gilbert_blyth 5 22 3 all
## 30 jane_rubi_josi 5 22 1 all
## 31 captain_jim_cornelia 5 22 1 all
## 32 captain_jim_slowli 5 22 1 all
## 33 captain_jim_shook 5 22 1 all
## 34 jim_shook_head 5 22 1 all
## 35 cornelia_captain_jim 5 22 1 all
## 36 captain_jim_told 5 22 1 all
## 37 captain_jim_gilbert 5 22 1 all
## 38 luci_maud_montgomeri 4 38 4 all
## 39 villag_improv_societi 4 38 1 all
## 40 educ_public_sentiment 4 38 1 all
## 41 green_gabl_lane 4 38 2 all
## 42 ann_bed_night 4 38 2 all
## 43 left_green_gabl 4 38 2 all
## 44 ann_orchard_slope 4 38 3 all
## 45 hester_gray_garden 4 38 2 all
## 46 green_gabl_kitchen 4 38 2 all
## 47 uncl_abe_predict 4 38 1 all
## 48 east_gabl_ann 4 38 2 all
## 49 uncl_abe_storm 4 38 1 all
## 50 marri_gilbert_blyth 4 38 3 all
# Top 50 features after removing the stopwords-iso list
anne_corpus_tokens_alt %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(50)
## ann marilla diana time gilbert girl day love
## 3984 1307 795 685 656 594 558 541
## eye feel hous davi matthew life told live
## 484 454 446 419 392 385 383 382
## lynd night lesli white talk captain peopl green
## 373 371 365 324 324 318 316 316
## jim hand marri look suppo school beauti mind
## 313 309 308 306 305 302 296 282
## even laugh gabl dream heart imagin cornelia mother
## 282 280 269 259 259 258 258 250
## friend head sit jane hair chapter boy walk
## 248 245 242 242 241 240 240 239
## avonlea hope
## 237 236
# For comparison: top 50 features using the original (default English)
# stopword removal
anne_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(50)
## ann said marilla go like one mrs never just littl
## 3984 2331 1307 1302 1296 1098 1072 1060 1033 1031
## think know say look come miss well diana thing oh
## 1019 977 927 915 859 817 806 795 767 740
## see old good time gilbert now get even much girl
## 705 692 688 685 656 637 611 610 606 594
## thought want can day ever love home mr came make
## 593 587 586 558 544 541 540 529 520 509
## went must eye alway year tell back seem ask feel
## 501 491 484 483 481 472 464 463 456 454
Marvel
Data source: https://www.kaggle.com/datasets/phiitm/marvel-cinematic-universe-dialogue-dataset
# Read every file under data/marvel and build a corpus with one document
# per file, using readtext's doc_id and text columns
marvel_script_text <- readtext(here::here("data/marvel/*"))
marvel_corpus <- marvel_script_text %>%
  corpus(docid_field = "doc_id", text_field = "text")
marvel_corpus
## Corpus consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## "I still think about the night your mother and I had to leave..."
##
## Ant-Man.txt :
## "Stark! He doesn't seem happy. Hello, Hank. You're supposed t..."
##
## Avengers.Age.of.Ultron.txt :
## "(DISTANT EXPLOSION) STRUCKER ON PA: Report to your stations ..."
##
## Avengers.Endgame.txt :
## "Okay, hold on, don't shoot. - You see where you're going? - ..."
##
## Avengers.Infinity.War.txt :
## "This is the Asgardian refugee vessel Statesman. We are under..."
##
## Avengers.txt :
## "The Tesseract has awakened. It is on a little world, a human..."
##
## [ reached max_ndoc ... 17 more documents ]
# Subset the corpus to its first document
marvel_corpus[1]
## Corpus consisting of 1 document.
## Ant-Man.And.The.Wasp.txt :
## "I still think about the night your mother and I had to leave..."
# Summarise the corpus: per-document type, token and sentence counts
marvel_corpus %>% summary()
## Corpus consisting of 23 documents, showing 23 documents:
##
## Text Types Tokens Sentences
## Ant-Man.And.The.Wasp.txt 1979 13654 1913
## Ant-Man.txt 2051 12139 1603
## Avengers.Age.of.Ultron.txt 2410 15275 1812
## Avengers.Endgame.txt 2218 15866 2170
## Avengers.Infinity.War.txt 1919 11879 1637
## Avengers.txt 2269 12623 1654
## Black.Panther.txt 1743 10308 1420
## Captain.America.Civil.War.txt 2499 14502 1693
## Captain.America.The.First.Avenger.txt 1917 9336 1199
## Captain.America.The.Winter.Soldier.txt 2219 10817 1406
## Captain.Marvel.txt 1762 9646 1386
## Doctor.Strange.txt 1698 8907 1219
## Guardians.of.the.Galaxy.txt 1839 10201 1258
## Guardians.of.the.Galaxy.Vol. 2.txt 1904 12031 1479
## Iron-Man.2.txt 2291 14985 1538
## Iron-Man.3.txt 2668 17136 1972
## Iron-Man.txt 2176 12979 1330
## Spider-Man.Far.From.Home.txt 2081 14453 2248
## Spider-Man.Homecoming.txt 2203 15847 2494
## The.Incredible.Hulk.txt 1284 6375 738
## Thor.Ragnarok.txt 1871 11815 1697
## Thor.The.Dark.World.txt 1550 7943 1063
## Thor.txt 1642 8711 1026
# Tokenise the corpus into words, dropping punctuation
marvel_corpus_tokens_orig <- marvel_corpus %>%
  tokens(what = "word", remove_punct = TRUE)
head(marvel_corpus_tokens_orig)
## Tokens consisting of 6 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "I" "still" "think" "about" "the" "night" "your" "mother"
## [9] "and" "I" "had" "to"
## [ ... and 10,179 more ]
##
## Ant-Man.txt :
## [1] "Stark" "He" "doesn't" "seem" "happy" "Hello"
## [7] "Hank" "You're" "supposed" "to" "be" "in"
## [ ... and 9,363 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "ON" "PA"
## [6] "Report" "to" "your" "stations" "immediately"
## [11] "This" "is"
## [ ... and 11,180 more ]
##
## Avengers.Endgame.txt :
## [1] "Okay" "hold" "on" "don't" "shoot" "You" "see" "where"
## [9] "you're" "going" "Mm-hmm" "Okay"
## [ ... and 11,712 more ]
##
## Avengers.Infinity.War.txt :
## [1] "This" "is" "the" "Asgardian" "refugee" "vessel"
## [7] "Statesman" "We" "are" "under" "assault" "I"
## [ ... and 8,907 more ]
##
## Avengers.txt :
## [1] "The" "Tesseract" "has" "awakened" "It" "is"
## [7] "on" "a" "little" "world" "a" "human"
## [ ... and 10,109 more ]
# Convert tokens to lower case. keep_acronyms = TRUE leaves all-caps tokens
# untouched, so upper-case stage directions such as "DISTANT EXPLOSION"
# stay upper case in the printed result.
# NOTE(review): the later stemming step then yields mixed-case stems such
# as "INDISTINCTLi" — confirm that keeping these upper case is intended.
marvel_corpus_tokens_orig <- marvel_corpus_tokens_orig %>%
  tokens_tolower(keep_acronyms = TRUE)
head(marvel_corpus_tokens_orig)
## Tokens consisting of 6 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "i" "still" "think" "about" "the" "night" "your" "mother"
## [9] "and" "i" "had" "to"
## [ ... and 10,179 more ]
##
## Ant-Man.txt :
## [1] "stark" "he" "doesn't" "seem" "happy" "hello"
## [7] "hank" "you're" "supposed" "to" "be" "in"
## [ ... and 9,363 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "ON" "PA"
## [6] "report" "to" "your" "stations" "immediately"
## [11] "this" "is"
## [ ... and 11,180 more ]
##
## Avengers.Endgame.txt :
## [1] "okay" "hold" "on" "don't" "shoot" "you" "see" "where"
## [9] "you're" "going" "mm-hmm" "okay"
## [ ... and 11,712 more ]
##
## Avengers.Infinity.War.txt :
## [1] "this" "is" "the" "asgardian" "refugee" "vessel"
## [7] "statesman" "we" "are" "under" "assault" "i"
## [ ... and 8,907 more ]
##
## Avengers.txt :
## [1] "the" "tesseract" "has" "awakened" "it" "is"
## [7] "on" "a" "little" "world" "a" "human"
## [ ... and 10,109 more ]
# Drop English stopwords plus the literal token "nbsp" (presumably leftover
# non-breaking-space markup). padding = FALSE removes the tokens outright
# instead of leaving empty placeholders.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
marvel_corpus_tokens <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(stopwords("english"), "nbsp"),
  padding = FALSE
)
head(marvel_corpus_tokens)
## Tokens consisting of 6 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "still" "think" "night" "mother" "leave"
## [6] "hopefully" "long" "call" "get" "settled"
## [11] "better" "indication"
## [ ... and 4,924 more ]
##
## Ant-Man.txt :
## [1] "stark" "seem" "happy" "hello" "hank" "supposed"
## [7] "moscow" "took" "detour" "defense" "lab" "tell"
## [ ... and 4,480 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "PA" "report"
## [6] "stations" "immediately" "drill" "attack" "SOLDIERS"
## [11] "SHOUTING" "INDISTINCTLY"
## [ ... and 5,717 more ]
##
## Avengers.Endgame.txt :
## [1] "okay" "hold" "shoot" "see" "going" "mm-hmm" "okay" "now"
## [9] "worry" "get" "gotta" "move"
## [ ... and 5,533 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refugee" "vessel" "statesman" "assault" "repeat"
## [7] "assault" "engines" "dead" "life" "support" "failing"
## [ ... and 4,127 more ]
##
## Avengers.txt :
## [1] "tesseract" "awakened" "little" "world" "human" "world"
## [7] "wield" "power" "ally" "knows" "workings" "never"
## [ ... and 4,696 more ]
# Reduce every remaining token to its word stem
marvel_corpus_tokens <- marvel_corpus_tokens %>%
  tokens_wordstem()
marvel_corpus_tokens
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "still" "think" "night" "mother" "leav" "hope" "long" "call"
## [9] "get" "settl" "better" "indic"
## [ ... and 4,924 more ]
##
## Ant-Man.txt :
## [1] "stark" "seem" "happi" "hello" "hank" "suppos" "moscow" "took"
## [9] "detour" "defens" "lab" "tell"
## [ ... and 4,480 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "PA" "report"
## [6] "station" "immedi" "drill" "attack" "SOLDIERS"
## [11] "SHOUTING" "INDISTINCTLi"
## [ ... and 5,717 more ]
##
## Avengers.Endgame.txt :
## [1] "okay" "hold" "shoot" "see" "go" "mm-hmm" "okay" "now"
## [9] "worri" "get" "gotta" "move"
## [ ... and 5,533 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refuge" "vessel" "statesman" "assault" "repeat"
## [7] "assault" "engin" "dead" "life" "support" "fail"
## [ ... and 4,127 more ]
##
## Avengers.txt :
## [1] "tesseract" "awaken" "littl" "world" "human" "world"
## [7] "wield" "power" "alli" "know" "work" "never"
## [ ... and 4,696 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Count adjacent token pairs (bigrams) over the stemmed tokens; frequently
# co-occurring pairs are typically compound words or names.
marvel_corpus_ngram <- dfm(tokens_ngrams(marvel_corpus_tokens, n = 2))
# List the 50 most frequent bigrams.
textstat_frequency(marvel_corpus_ngram, n = 50)
## feature frequency rank docfreq group
## 1 oh_god 117 1 21 all
## 2 mr_stark 102 2 11 all
## 3 right_now 97 3 21 all
## 4 look_like 87 4 21 all
## 5 go_go 86 5 19 all
## 6 hey_hey 66 6 12 all
## 7 iron_man 59 7 11 all
## 8 okay_okay 56 8 14 all
## 9 get_back 54 9 18 all
## 10 yeah_yeah 53 10 15 all
## 11 toni_stark 52 11 9 all
## 12 wait_wait 50 12 12 all
## 13 come_come 46 13 16 all
## 14 come_back 44 14 17 all
## 15 know_know 44 14 19 all
## 16 captain_america 41 16 8 all
## 17 year_ago 39 17 17 all
## 18 can_get 38 18 15 all
## 19 go_back 36 19 16 all
## 20 new_york 36 19 14 all
## 21 oh_yeah 35 21 17 all
## 22 can_see 35 21 15 all
## 23 need_help 34 23 17 all
## 24 yeah_well 33 24 17 all
## 25 even_know 32 25 20 all
## 26 go_get 32 25 16 all
## 27 gonna_need 32 25 17 all
## 28 let_go 32 25 14 all
## 29 just_like 31 29 17 all
## 30 gonna_go 30 30 13 all
## 31 yes_sir 30 30 13 all
## 32 feel_like 28 32 13 all
## 33 sound_like 28 32 15 all
## 34 yeah_know 28 32 15 all
## 35 one_thing 28 32 17 all
## 36 make_sure 28 32 15 all
## 37 move_move 28 32 9 all
## 38 okay_yeah 28 32 12 all
## 39 right_right 27 39 15 all
## 40 need_get 26 40 13 all
## 41 just_need 26 40 18 all
## 42 just_want 26 40 16 all
## 43 can_help 26 40 17 all
## 44 right_yeah 26 40 12 all
## 45 gotta_go 25 45 6 all
## 46 two_one 25 45 11 all
## 47 bring_back 24 47 10 all
## 48 can_take 24 47 14 all
## 49 tell_us 24 47 13 all
## 50 three_two 24 47 11 all
# Same idea with three-token sequences (trigrams) over the stemmed tokens.
marvel_corpus_ngram3 <- dfm(tokens_ngrams(marvel_corpus_tokens, n = 3))
# List the 50 most frequent trigrams.
textstat_frequency(marvel_corpus_ngram3, n = 50)
## feature frequency rank docfreq group
## 1 go_go_go 33 1 10 all
## 2 wait_wait_wait 24 2 11 all
## 3 three_two_one 22 3 10 all
## 4 hey_hey_hey 20 4 9 all
## 5 yeah_yeah_yeah 15 5 8 all
## 6 okay_okay_okay 12 6 6 all
## 7 come_come_come 10 7 5 all
## 8 well_left_hand 10 7 1 all
## 9 left_hand_free 10 7 1 all
## 10 oh_god_okay 9 10 6 all
## 11 oh_god_oh 8 11 4 all
## 12 move_move_move 8 11 4 all
## 13 hand_free_oh 8 11 1 all
## 14 hail_hydra_hail 8 11 1 all
## 15 hydra_hail_hydra 8 11 1 all
## 16 sea_bass_sea 8 11 1 all
## 17 bass_sea_bass 8 11 1 all
## 18 god_oh_god 7 18 4 all
## 19 five_four_three 7 18 5 all
## 20 whoa_whoa_whoa 7 18 6 all
## 21 five_year_ago 7 18 4 all
## 22 oh_well_left 7 18 1 all
## 23 can_still_hear 7 18 2 all
## 24 hey_big_guy 6 24 3 all
## 25 thor_son_odin 6 24 3 all
## 26 pleas_pleas_pleas 6 24 2 all
## 27 okay_yeah_yeah 6 24 5 all
## 28 free_oh_well 6 24 1 all
## 29 six_year_ago 6 24 3 all
## 30 love_now_never 6 24 1 all
## 31 now_never_love 6 24 1 all
## 32 never_love_can 6 24 1 all
## 33 love_can_still 6 24 1 all
## 34 still_hear_say 6 24 1 all
## 35 hear_say_never 6 24 1 all
## 36 say_never_break 6 24 1 all
## 37 never_break_chain 6 24 1 all
## 38 watch_beeping_rapidli 6 24 1 all
## 39 know_feel_like 5 39 3 all
## 40 look_like_got 5 39 5 all
## 41 four_three_two 5 39 5 all
## 42 oh_thank_god 5 39 5 all
## 43 hey_mr_stark 5 39 4 all
## 44 dormammu_come_bargain 5 39 1 all
## 45 toni_toni_toni 5 39 3 all
## 46 colonel_jame_rhode 5 39 2 all
## 47 feel_like_know 5 39 2 all
## 48 go_right_now 4 48 4 all
## 49 yes_yes_yes 4 48 2 all
## 50 gotta_go_gotta 4 48 4 all
# Top 50 single-word features. The tokens were already stemmed above;
# dfm_wordstem() runs the stemmer once more on the dfm's features.
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens)), n = 50)
## know go get just right like yeah come can one got
## 1312 1179 1054 1019 988 890 870 834 802 773 758
## now okay gonna oh want need look think hey time back
## 737 707 647 627 609 608 599 586 558 548 532
## see well us take good thing guy man tell thank say
## 521 517 503 495 488 447 445 413 391 381 377
## yes make stark call toni realli way work someth sorri peopl
## 371 365 361 353 349 336 335 329 315 314 313
## kill tri help never littl give
## 300 293 286 281 276 274
Marvel Alternate Stopwords
# Redo the stopword removal from the original tokens, this time with the
# "stopwords-iso" source from the stopwords package (plus the literal token
# "nbsp"). padding = FALSE drops matched tokens without leaving placeholders.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(stopwords::stopwords(source = "stopwords-iso"), "nbsp"),
  padding = FALSE
)
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "night" "mother" "leave" "settled" "indication"
## [6] "mommy" "jellybean" "daddy" "last-minute" "business"
## [11] "trip" "rose"
## [ ... and 2,786 more ]
##
## Ant-Man.txt :
## [1] "stark" "happy" "hank" "supposed" "moscow" "detour"
## [7] "defense" "lab" "depends" "poor" "attempt" "replicate"
## [ ... and 2,694 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "report" "stations"
## [6] "drill" "attack" "SOLDIERS" "SHOUTING" "INDISTINCTLY"
## [11] "attack" "GRUNTS"
## [ ... and 3,672 more ]
##
## Avengers.Endgame.txt :
## [1] "hold" "shoot" "mm-hmm" "worry" "gotta" "foot" "toe"
## [8] "hips" "yeah" "mm-hmm" "ready" "fingers"
## [ ... and 3,057 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refugee" "vessel" "statesman" "assault" "repeat"
## [7] "assault" "engines" "dead" "life" "support" "failing"
## [ ... and 2,498 more ]
##
## Avengers.txt :
## [1] "tesseract" "awakened" "human" "wield" "power" "ally"
## [7] "workings" "ready" "lead" "force" "chitauri" "follow"
## [ ... and 2,883 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Stem the alternate token set in place and print a preview.
# NOTE(review): upper-case tokens are not reduced cleanly (the output shows
# "INDISTINCTLi"); lower-casing before stemming may be what was intended.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>% tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "night" "mother" "leav" "settl" "indic"
## [6] "mommi" "jellybean" "daddi" "last-minut" "busi"
## [11] "trip" "rose"
## [ ... and 2,786 more ]
##
## Ant-Man.txt :
## [1] "stark" "happi" "hank" "suppos" "moscow" "detour" "defens"
## [8] "lab" "depend" "poor" "attempt" "replic"
## [ ... and 2,694 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "report" "station"
## [6] "drill" "attack" "SOLDIERS" "SHOUTING" "INDISTINCTLi"
## [11] "attack" "GRUNTS"
## [ ... and 3,672 more ]
##
## Avengers.Endgame.txt :
## [1] "hold" "shoot" "mm-hmm" "worri" "gotta" "foot" "toe" "hip"
## [9] "yeah" "mm-hmm" "readi" "finger"
## [ ... and 3,057 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refuge" "vessel" "statesman" "assault" "repeat"
## [7] "assault" "engin" "dead" "life" "support" "fail"
## [ ... and 2,498 more ]
##
## Avengers.txt :
## [1] "tesseract" "awaken" "human" "wield" "power" "alli"
## [7] "work" "readi" "lead" "forc" "chitauri" "follow"
## [ ... and 2,883 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Count bigrams over the alternate stemmed tokens; frequently co-occurring
# pairs are typically compound words or names.
marvel_corpus_ngram_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 2))
# List the 50 most frequent bigrams.
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
## feature frequency rank docfreq group
## 1 hey_hey 78 1 12 all
## 2 yeah_yeah 63 2 16 all
## 3 wait_wait 54 3 13 all
## 4 toni_stark 53 4 9 all
## 5 captain_america 41 5 8 all
## 6 hey_guy 29 6 14 all
## 7 toni_toni 24 7 5 all
## 8 gonna_gonna 20 8 14 all
## 9 save_life 20 8 17 all
## 10 people_screaming 20 8 3 all
## 11 infin_stone 20 8 8 all
## 12 captain_roger 20 8 4 all
## 13 yeah_gonna 18 13 9 all
## 14 spider-man_spider-man 18 13 2 all
## 15 gonna_die 17 15 11 all
## 16 whoa_whoa 17 15 8 all
## 17 nick_furi 17 15 7 all
## 18 hank_pym 16 18 4 all
## 19 gonna_kill 16 18 9 all
## 20 stark_industri 16 18 3 all
## 21 quantum_realm 15 21 3 all
## 22 son_bitch 15 21 12 all
## 23 hail_hydra 15 21 4 all
## 24 speaking_portuguese 15 21 1 all
## 25 yeah_hey 14 25 7 all
## 26 yeah_time 13 26 9 all
## 27 uh_yeah 13 26 8 all
## 28 bad_guy 13 26 9 all
## 29 love_love 13 26 6 all
## 30 music_playing 13 26 5 all
## 31 dark_dimens 13 26 1 all
## 32 nova_corp 13 26 2 all
## 33 bird_bird 13 26 2 all
## 34 night_monkey 13 26 1 all
## 35 sea_bass 13 26 1 all
## 36 truth_serum 12 36 1 all
## 37 time_time 12 36 8 all
## 38 wait_minut 12 36 9 all
## 39 yeah_guy 12 36 8 all
## 40 grunting_groans 12 36 3 all
## 41 lot_peopl 12 36 8 all
## 42 peter_parker 12 36 3 all
## 43 agent_romanoff 12 36 3 all
## 44 plan_plan 12 36 5 all
## 45 black_panther 12 36 2 all
## 46 watch_beeping 12 36 3 all
## 47 left_hand 12 36 1 all
## 48 dr_pym 11 48 2 all
## 49 ladi_gentlemen 11 48 6 all
## 50 yeah_uh 11 48 8 all
# Count trigrams over the alternate stemmed tokens.
marvel_corpus_ngram3_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 3))
# List the 50 most frequent trigrams.
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
## feature frequency rank docfreq group
## 1 hey_hey_hey 26 1 10 all
## 2 wait_wait_wait 24 2 11 all
## 3 yeah_yeah_yeah 16 3 8 all
## 4 sea_bass_sea 9 4 1 all
## 5 bass_sea_bass 9 4 1 all
## 6 hail_hydra_hail 8 6 1 all
## 7 hydra_hail_hydra 8 6 1 all
## 8 whoa_whoa_whoa 7 8 6 all
## 9 bird_bird_bird 7 8 1 all
## 10 thor_son_odin 6 10 3 all
## 11 left_hand_left 6 10 1 all
## 12 hand_left_hand 6 10 1 all
## 13 love_love_hear 6 10 1 all
## 14 love_hear_break 6 10 1 all
## 15 hear_break_chain 6 10 1 all
## 16 toni_toni_toni 6 10 4 all
## 17 spider-man_spider-man_spider-man 6 10 2 all
## 18 watch_beeping_rapidli 6 10 1 all
## 19 colonel_jame_rhode 5 19 2 all
## 20 toni_stark_toni 5 19 2 all
## 21 night_monkey_night 5 19 1 all
## 22 monkey_night_monkey 5 19 1 all
## 23 concept_time_space 4 23 1 all
## 24 doctor_stephen_strang 4 23 3 all
## 25 drax_drax_drax 4 23 2 all
## 26 friend_neighborhood_spider-man 4 23 3 all
## 27 prais_ancestor_prais 4 23 1 all
## 28 ancestor_prais_ancestor 4 23 1 all
## 29 mission_report_decemb 4 23 1 all
## 30 report_decemb_16 4 23 1 all
## 31 decemb_16_1991 4 23 1 all
## 32 peopl_gonna_die 4 23 3 all
## 33 draw_power_dark 4 23 1 all
## 34 power_dark_dimens 4 23 1 all
## 35 strang_doctor_strang 4 23 2 all
## 36 tea_drink_tea 4 23 2 all
## 37 break_chain_love 4 23 1 all
## 38 chain_love_love 4 23 1 all
## 39 lieuten_colonel_jame 4 23 1 all
## 40 ceo_stark_industri 4 23 2 all
## 41 latin_speak_latin 4 23 1 all
## 42 toni_stark_trust 4 23 1 all
## 43 night_monkey_yeah 4 23 1 all
## 44 thunder_thunder_thunder 4 23 1 all
## 45 dr_erik_selvig 4 23 2 all
## 46 hope_van_dyne 3 46 1 all
## 47 pym_van_dyne 3 46 1 all
## 48 time_space_irrelev 3 46 1 all
## 49 hey_guy_sun 3 46 2 all
## 50 guy_sun_real 3 46 2 all
# Top 50 single-word features of the alternate token set; dfm_wordstem()
# runs the stemmer once more on the dfm's features.
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens_alt)), n = 50)
## yeah gonna hey time guy stark toni peopl kill wait
## 870 647 558 548 445 361 349 313 300 269
## power sir talk god suit love life day friend grunt
## 245 244 227 218 192 185 185 182 180 171
## happen father stone come gotta peter hand die head kid
## 170 167 167 166 164 162 161 160 154 152
## weapon call lot told stay thor live feel save captain
## 151 150 146 146 145 145 143 143 142 142
## groan plan uh start wanna hope leav fine nice bring
## 142 133 133 132 130 128 127 126 125 124
# Baseline for comparison: top 50 features of the token set built with
# stopwords("english").
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
## know go get just right like yeah come can one got
## 1312 1179 1054 1019 988 890 870 834 802 773 758
## now okay gonna oh want need look think hey time back
## 737 707 647 627 609 608 599 586 558 548 532
## see well us take good thing guy man tell thank say
## 521 517 503 495 488 447 445 413 391 381 377
## yes make stark call toni realli way work someth sorri peopl
## 371 365 361 353 349 336 335 329 315 314 313
## kill tri help never littl give
## 300 293 286 281 276 274
Marvel Alternate Stopwords II
# Treat the first 5,000 entries of cleanNLP::word_frequency as stopwords,
# together with `contractions` (defined outside this section) and the
# literal token "nbsp". This overwrites the previous marvel_corpus_tokens_alt.
# NOTE(review): presumably word_frequency is sorted by descending frequency,
# so these are the 5,000 most common words -- confirm.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(cleanNLP::word_frequency$word[1:5000], contractions, "nbsp"),
  padding = FALSE
)
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "hopefully" "settled" "indication" "mommy" "jellybean"
## [6] "daddy" "last-minute" "ugh" "boring" "goodbye"
## [11] "sweetheart" "janet"
## [ ... and 1,194 more ]
##
## Ant-Man.txt :
## [1] "stark" "hank" "moscow" "detour" "replicate"
## [6] "nerve" "instructed" "remind" "pym" "soldier"
## [11] "scientist" "pym"
## [ ... and 1,148 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "drill" "SHOUTING"
## [6] "INDISTINCTLY" "GRUNTS" "POWERING" "YELLS" "GRUNTING"
## [11] "SCREAMING" "ROARING"
## [ ... and 1,831 more ]
##
## Avengers.Endgame.txt :
## [1] "shoot" "mm-hmm" "worry" "gotta" "hips" "mm-hmm" "fingers"
## [8] "kiddo" "mayo" "mustard" "puts" "mayo"
## [ ... and 1,223 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refugee" "vessel" "statesman" "assault"
## [6] "assault" "failing" "requesting" "vessel" "22"
## [11] "asgard" "asgardian"
## [ ... and 1,121 more ]
##
## Avengers.txt :
## [1] "tesseract" "awakened" "wield" "ally" "workings"
## [6] "chitauri" "humans" "evacuation" "selvig" "surge"
## [11] "tesseract" "authorise"
## [ ... and 1,239 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Stem the filtered tokens in place and print a preview.
# NOTE(review): upper-case tokens are not reduced cleanly (the output shows
# "INDISTINCTLi"); lower-casing before stemming may be what was intended.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>% tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "hope" "settl" "indic" "mommi" "jellybean"
## [6] "daddi" "last-minut" "ugh" "bore" "goodby"
## [11] "sweetheart" "janet"
## [ ... and 1,194 more ]
##
## Ant-Man.txt :
## [1] "stark" "hank" "moscow" "detour" "replic" "nerv"
## [7] "instruct" "remind" "pym" "soldier" "scientist" "pym"
## [ ... and 1,148 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "DISTANT" "EXPLOSION" "STRUCKER" "drill" "SHOUTING"
## [6] "INDISTINCTLi" "GRUNTS" "POWERING" "YELLS" "GRUNTING"
## [11] "SCREAMING" "ROARING"
## [ ... and 1,831 more ]
##
## Avengers.Endgame.txt :
## [1] "shoot" "mm-hmm" "worri" "gotta" "hip" "mm-hmm" "finger"
## [8] "kiddo" "mayo" "mustard" "put" "mayo"
## [ ... and 1,223 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refuge" "vessel" "statesman" "assault" "assault"
## [7] "fail" "request" "vessel" "22" "asgard" "asgardian"
## [ ... and 1,121 more ]
##
## Avengers.txt :
## [1] "tesseract" "awaken" "wield" "alli" "work" "chitauri"
## [7] "human" "evacu" "selvig" "surg" "tesseract" "authoris"
## [ ... and 1,239 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Count bigrams over the tokens left after removing the 5,000-word list;
# frequently co-occurring pairs are typically compound words or names.
marvel_corpus_ngram_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 2))
# List the 50 most frequent bigrams.
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
## feature frequency rank docfreq group
## 1 stark_stark 30 1 8 all
## 2 spider-man_spider-man 25 2 2 all
## 3 whoa_whoa 17 3 8 all
## 4 hank_pym 16 4 4 all
## 5 grunting_groans 16 4 3 all
## 6 quantum_realm 15 6 3 all
## 7 hulk_hulk 15 6 5 all
## 8 thano_thano 15 6 3 all
## 9 hail_hydra 15 6 4 all
## 10 breath_breath 13 10 4 all
## 11 colonel_rhode 13 10 4 all
## 12 infin_stone 12 12 5 all
## 13 gotta_gotta 11 13 9 all
## 14 thor_thor 11 13 7 all
## 15 hammer_hammer 11 13 6 all
## 16 groot_groot 11 13 3 all
## 17 grunts_groans 10 17 3 all
## 18 sirens_wailing 10 17 4 all
## 19 furi_furi 10 17 4 all
## 20 arc_reactor 10 17 3 all
## 21 monkey_monkey 10 17 2 all
## 22 t'challa_t'challa 10 17 1 all
## 23 helmut_zemo 10 17 1 all
## 24 indistinct_chatter 9 24 3 all
## 25 loki_loki 9 24 4 all
## 26 drax_drax 9 24 2 all
## 27 indistinct_conversations 8 27 2 all
## 28 dude_dude 8 27 2 all
## 29 asgard_asgard 8 27 4 all
## 30 hydra_hail 8 27 1 all
## 31 frost_giant 8 27 1 all
## 32 hurt_hurt 7 32 6 all
## 33 wanna_wanna 7 32 5 all
## 34 breathing_heavili 7 32 3 all
## 35 thor_odin 7 32 4 all
## 36 aveng_aveng 7 32 5 all
## 37 stone_stone 7 32 3 all
## 38 hand_hand 7 32 2 all
## 39 erik_selvig 7 32 3 all
## 40 heart-shap_herb 7 32 1 all
## 41 prais_ancestor 7 32 1 all
## 42 sergeant_barn 7 32 4 all
## 43 lieuten_colonel 7 32 3 all
## 44 ver_ver 7 32 1 all
## 45 drone_drone 7 32 2 all
## 46 stark_internship 7 32 1 all
## 47 beeping_rapidli 7 32 1 all
## 48 indistinct_chattering 7 32 1 all
## 49 heimdal_heimdal 7 32 2 all
## 50 thunder_thunder 7 32 1 all
# Count trigrams over the tokens left after removing the 5,000-word list.
marvel_corpus_ngram3_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 3))
# List the 50 most frequent trigrams.
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
## feature frequency rank docfreq group
## 1 spider-man_spider-man_spider-man 11 1 2 all
## 2 stark_stark_stark 10 2 5 all
## 3 hail_hydra_hail 8 3 1 all
## 4 hydra_hail_hydra 8 3 1 all
## 5 whoa_whoa_whoa 7 5 6 all
## 6 t'challa_t'challa_t'challa 7 5 1 all
## 7 dude_dude_dude 6 7 2 all
## 8 hulk_hulk_hulk 6 7 2 all
## 9 monkey_monkey_monkey 6 7 1 all
## 10 drax_drax_drax 5 10 2 all
## 11 breath_breath_breath 5 10 2 all
## 12 lieuten_colonel_rhode 5 10 2 all
## 13 toast_toast_toast 5 10 1 all
## 14 prais_ancestor_prais 4 14 1 all
## 15 ancestor_prais_ancestor 4 14 1 all
## 16 hand_hand_hand 4 14 1 all
## 17 quill_rocket_quill 4 14 2 all
## 18 thunder_thunder_thunder 4 14 1 all
## 19 hank_pym_hank 3 19 2 all
## 20 recalibr_recalibr_recalibr 3 19 1 all
## 21 earth_mightiest_hero 3 19 3 all
## 22 stone_snap_finger 3 19 2 all
## 23 lila_lila_lila 3 19 1 all
## 24 infin_stone_stone 3 19 2 all
## 25 loki_loki_loki 3 19 2 all
## 26 kamar-taj_kamar-taj_kamar-taj 3 19 1 all
## 27 sanctum_sanctum_sanctum 3 19 1 all
## 28 drone_drone_drone 3 19 1 all
## 29 grunting_groans_grunts 3 19 1 all
## 30 ned_liz_liz 3 19 1 all
## 31 beeping_rapidli_beeping 3 19 1 all
## 32 rapidli_beeping_slows 3 19 1 all
## 33 reveng_reveng_reveng 3 19 1 all
## 34 spear_thudding_thor 3 19 1 all
## 35 subatom_deactiv_bomb 2 35 2 all
## 36 anton_anton_anton 2 35 1 all
## 37 hank_pym_dyne 2 35 1 all
## 38 bye_daddi_bye 2 35 1 all
## 39 destroy_life_hank 2 35 1 all
## 40 paraponera_clavata_bullet 2 35 1 all
## 41 clavata_bullet_ant 2 35 1 all
## 42 gotta_darren_hank 2 35 1 all
## 43 entranc_exterior_vent 2 35 1 all
## 44 daddi_cassi_daddi 2 35 1 all
## 45 strucker_soldier_shouting 2 35 1 all
## 46 sooner_stark_sceptr 2 35 1 all
## 47 ultron_stark_jarvis 2 35 1 all
## 48 stark_ultron_jarvis 2 35 1 all
## 49 jarvis_stark_ultron 2 35 1 all
## 50 stark_rhodes_stark 2 35 1 all
# Top 50 single-word features after removing the 5,000-word list;
# dfm_wordstem() runs the stemmer once more on the dfm's features.
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens_alt)), n = 50)
## stark grunt gotta thor groan uh
## 361 171 164 145 142 133
## wanna asgard loki jarvi furi thano
## 130 122 115 104 94 94
## scream aveng whoa damn s.h.i.e.l.d destroy
## 93 91 90 90 89 88
## spider-man roger hulk quill ultron hydra
## 86 82 80 79 78 76
## hank worri gasp pepper groot stone
## 74 72 72 69 69 68
## laugh weapon soldier huh shut excus
## 67 66 65 63 63 62
## cannot hurt realm dude odin wakanda
## 62 61 60 59 58 57
## hammer ronan breath somebodi tesseract daddi
## 56 54 53 50 50 49
## beep ned
## 49 48
# Baseline for comparison: top 50 features of the token set built with
# stopwords("english").
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
## know go get just right like yeah come can one got
## 1312 1179 1054 1019 988 890 870 834 802 773 758
## now okay gonna oh want need look think hey time back
## 737 707 647 627 609 608 599 586 558 548 532
## see well us take good thing guy man tell thank say
## 521 517 503 495 488 447 445 413 391 381 377
## yes make stark call toni realli way work someth sorri peopl
## 371 365 361 353 349 336 335 329 315 314 313
## kill tri help never littl give
## 300 293 286 281 276 274
Marvel Alternate Stopwords III
# Treat the first 10,000 entries of cleanNLP::word_frequency as stopwords,
# together with `contractions` (defined outside this section) and the
# literal token "nbsp". This overwrites the previous marvel_corpus_tokens_alt.
# NOTE(review): presumably word_frequency is sorted by descending frequency,
# so these are the 10,000 most common words -- confirm.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(cleanNLP::word_frequency$word[1:10000], contractions, "nbsp"),
  padding = FALSE
)
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "mommy" "jellybean" "last-minute" "ugh" "goodbye"
## [6] "sweetheart" "tucked" "disarm" "plating" "shrink"
## [11] "hank" "regulator"
## [ ... and 703 more ]
##
## Ant-Man.txt :
## [1] "stark" "hank" "detour" "replicate" "instructed"
## [6] "pym" "pym" "errand" "ferocity" "pym"
## [11] "hank" "ferocity"
## [ ... and 709 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "STRUCKER" "SHOUTING" "INDISTINCTLY" "GRUNTS" "POWERING"
## [6] "YELLS" "GRUNTING" "SCREAMING" "ROARING" "jarvis"
## [11] "upstairs" "JARVIS"
## [ ... and 1,336 more ]
##
## Avengers.Endgame.txt :
## [1] "mm-hmm" "hips" "mm-hmm" "kiddo" "mayo" "mustard" "mayo"
## [8] "mustard" "mama" "nate" "mayo" "mustard"
## [ ... and 751 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refugee" "statesman" "22" "asgard"
## [6] "asgardian" "rejoice" "titan" "thanos" "desperately"
## [11] "nonetheless" "frightening"
## [ ... and 702 more ]
##
## Avengers.txt :
## [1] "tesseract" "awakened" "wield" "ally" "workings"
## [6] "chitauri" "evacuation" "selvig" "tesseract" "authorise"
## [11] "selvig" "spontaneous"
## [ ... and 856 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Stem the filtered tokens in place and print a preview.
# NOTE(review): upper-case tokens are not reduced cleanly (the output shows
# "INDISTINCTLi"); lower-casing before stemming may be what was intended.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>% tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "mommi" "jellybean" "last-minut" "ugh" "goodby"
## [6] "sweetheart" "tuck" "disarm" "plate" "shrink"
## [11] "hank" "regul"
## [ ... and 703 more ]
##
## Ant-Man.txt :
## [1] "stark" "hank" "detour" "replic" "instruct" "pym"
## [7] "pym" "errand" "feroc" "pym" "hank" "feroc"
## [ ... and 709 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "STRUCKER" "SHOUTING" "INDISTINCTLi" "GRUNTS" "POWERING"
## [6] "YELLS" "GRUNTING" "SCREAMING" "ROARING" "jarvi"
## [11] "upstair" "JARVIS"
## [ ... and 1,336 more ]
##
## Avengers.Endgame.txt :
## [1] "mm-hmm" "hip" "mm-hmm" "kiddo" "mayo" "mustard" "mayo"
## [8] "mustard" "mama" "nate" "mayo" "mustard"
## [ ... and 751 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "refuge" "statesman" "22" "asgard"
## [6] "asgardian" "rejoic" "titan" "thano" "desper"
## [11] "nonetheless" "frighten"
## [ ... and 702 more ]
##
## Avengers.txt :
## [1] "tesseract" "awaken" "wield" "alli" "work" "chitauri"
## [7] "evacu" "selvig" "tesseract" "authoris" "selvig" "spontan"
## [ ... and 856 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Count bigrams over the tokens left after removing the 10,000-word list;
# frequently co-occurring pairs are typically compound words or names.
marvel_corpus_ngram_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 2))
# List the 50 most frequent bigrams.
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
## feature frequency rank docfreq group
## 1 stark_stark 42 1 10 all
## 2 spider-man_spider-man 28 2 2 all
## 3 hulk_hulk 19 3 5 all
## 4 thano_thano 18 4 4 all
## 5 whoa_whoa 17 5 8 all
## 6 furi_furi 17 5 4 all
## 7 hank_pym 16 7 4 all
## 8 grunting_groans 16 7 3 all
## 9 hail_hydra 15 9 4 all
## 10 thor_thor 14 10 7 all
## 11 loki_loki 13 11 4 all
## 12 groot_groot 13 11 3 all
## 13 colonel_rhode 13 11 4 all
## 14 breath_breath 12 14 4 all
## 15 t'challa_t'challa 12 14 2 all
## 16 grunts_groans 11 16 3 all
## 17 quill_quill 11 16 3 all
## 18 drax_drax 11 16 2 all
## 19 sirens_wailing 10 19 4 all
## 20 asgard_asgard 10 19 4 all
## 21 helmut_zemo 10 19 1 all
## 22 indistinct_chatter 9 22 3 all
## 23 drone_drone 9 22 2 all
## 24 aveng_aveng 8 24 5 all
## 25 hydra_hail 8 24 1 all
## 26 beeping_beeping 8 24 2 all
## 27 malekith_aether 8 24 1 all
## 28 pym_hank 7 28 3 all
## 29 aveng_stark 7 28 6 all
## 30 groans_groans 7 28 2 all
## 31 stark_ultron 7 28 1 all
## 32 wakanda_wakanda 7 28 2 all
## 33 thor_odin 7 28 4 all
## 34 loki_thor 7 28 3 all
## 35 lieuten_colonel 7 28 3 all
## 36 ver_ver 7 28 1 all
## 37 odin_asgard 7 28 3 all
## 38 ned_ned 7 28 2 all
## 39 indistinct_chattering 7 28 1 all
## 40 heimdal_heimdal 7 28 2 all
## 41 baba_yaga 6 41 1 all
## 42 grunting_screaming 6 41 5 all
## 43 groans_grunts 6 41 2 all
## 44 ultron_stark 6 41 1 all
## 45 ultron_jarvis 6 41 1 all
## 46 s.h.i.e.l.d_s.h.i.e.l.d 6 41 4 all
## 47 swear_swear 6 41 4 all
## 48 grunts_grunts 6 41 2 all
## 49 hand_hand 6 41 1 all
## 50 lawson_lawson 6 41 1 all
# Count trigrams over the tokens left after removing the 10,000-word list.
marvel_corpus_ngram3_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 3))
# List the 50 most frequent trigrams.
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
## feature frequency rank docfreq group
## 1 spider-man_spider-man_spider-man 11 1 2 all
## 2 stark_stark_stark 10 2 5 all
## 3 hulk_hulk_hulk 9 3 2 all
## 4 t'challa_t'challa_t'challa 9 3 1 all
## 5 hail_hydra_hail 8 5 1 all
## 6 hydra_hail_hydra 8 5 1 all
## 7 whoa_whoa_whoa 7 7 6 all
## 8 drax_drax_drax 7 7 2 all
## 9 furi_furi_furi 6 9 2 all
## 10 loki_loki_loki 5 10 2 all
## 11 breath_breath_breath 5 10 2 all
## 12 lieuten_colonel_rhode 5 10 2 all
## 13 toast_toast_toast 5 10 1 all
## 14 hank_pym_hank 4 14 2 all
## 15 jarvis_stark_ultron 4 14 1 all
## 16 hand_hand_hand 4 14 1 all
## 17 kamar-taj_kamar-taj_kamar-taj 4 14 1 all
## 18 sanctum_sanctum_sanctum 4 14 1 all
## 19 quill_quill_quill 4 14 2 all
## 20 recalibr_recalibr_recalibr 3 20 1 all
## 21 lila_lila_lila 3 20 1 all
## 22 hydra_johann_schmidt 3 20 1 all
## 23 drone_drone_drone 3 20 1 all
## 24 grunting_groans_grunts 3 20 1 all
## 25 beeping_beeping_slows 3 20 1 all
## 26 malekith_aether_malekith 3 20 1 all
## 27 spear_thudding_thor 3 20 1 all
## 28 anton_anton_anton 2 28 1 all
## 29 hank_pym_dyne 2 28 1 all
## 30 pym_hank_pym 2 28 2 all
## 31 hank_hank_hank 2 28 2 all
## 32 paraponera_clavata_ant 2 28 1 all
## 33 irrelev_shrink_etern 2 28 1 all
## 34 tripl_entranc_vent 2 28 1 all
## 35 whispers_groans_groans 2 28 2 all
## 36 sooner_stark_sceptr 2 28 1 all
## 37 ultron_stark_jarvis 2 28 1 all
## 38 stark_ultron_jarvis 2 28 1 all
## 39 ultron_jarvis_stark 2 28 1 all
## 40 stark_rhodes_stark 2 28 1 all
## 41 rhodey_grunts_groans 2 28 2 all
## 42 grunts_groans_grunting 2 28 2 all
## 43 jarvi_ultron_ultron 2 28 1 all
## 44 grunts_groans_stark 2 28 2 all
## 45 hulk_hulk_aveng 2 28 2 all
## 46 murder_trillion_atom 2 28 1 all
## 47 trillion_atom_inevit 2 28 1 all
## 48 nebula_nebula_morag 2 28 1 all
## 49 nebula_morag_duplic 2 28 1 all
## 50 repeat_repeat_repeat 2 28 1 all
# Top 50 single-word features after removing the 10,000-word list;
# dfm_wordstem() runs the stemmer once more on the dfm's features.
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens_alt)), n = 50)
## stark grunt thor groan asgard loki
## 361 171 145 142 122 115
## jarvi furi thano scream aveng whoa
## 104 94 94 93 91 90
## s.h.i.e.l.d spider-man hulk quill ultron hydra
## 89 86 80 79 78 76
## hank gasp groot huh cannot odin
## 74 72 69 63 62 58
## wakanda ronan tesseract beep ned colonel
## 57 54 50 49 48 46
## barton romanoff pant yondu pym sigh
## 45 44 44 44 43 43
## chuckl vibranium kid t'challa drone idiot
## 40 40 38 38 38 37
## gamora kree indistinct natasha heimdal stole
## 37 37 36 36 36 35
## here throne
## 35 35
# Baseline for comparison: top 50 features of the token set built with
# stopwords("english").
marvel_corpus_tokens %>%
  dfm() %>%
  dfm_wordstem() %>%
  topfeatures(n = 50)
## know go get just right like yeah come can one got
## 1312 1179 1054 1019 988 890 870 834 802 773 758
## now okay gonna oh want need look think hey time back
## 737 707 647 627 609 608 599 586 558 548 532
## see well us take good thing guy man tell thank say
## 521 517 503 495 488 447 445 413 391 381 377
## yes make stark call toni realli way work someth sorri peopl
## 371 365 361 353 349 336 335 329 315 314 313
## kill tri help never littl give
## 300 293 286 281 276 274
Marvel Alternate Stopwords IV
# Treat the first 50,000 entries of cleanNLP::word_frequency as stopwords,
# together with `contractions` (defined outside this section) and the
# literal token "nbsp". This overwrites the previous marvel_corpus_tokens_alt.
# NOTE(review): presumably word_frequency is sorted by descending frequency,
# so these are the 50,000 most common words -- confirm.
marvel_corpus_tokens_alt <- tokens_remove(
  marvel_corpus_tokens_orig,
  c(cleanNLP::word_frequency$word[1:50000], contractions, "nbsp"),
  padding = FALSE
)
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "jellybean" "last-minute" "mother's" "subatomic"
## [5] "burrowed" "lased" "micro-treasure" "show-and-tell"
## [9] "world's" "karapetyan" "overquote" "karapetyan"
## [ ... and 225 more ]
##
## Ant-Man.txt :
## [1] "pym" "pym" "ferocity" "pym" "ferocity" "pym"
## [7] "full-size" "pym" "peachy" "weirdest" "ha-ha" "peachy"
## [ ... and 211 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "STRUCKER" "INDISTINCTLY" "GRUNTS" "GRUNTING" "strucker's"
## [6] "GRUNTING" "loki's" "strucker" "GROANS" "GRUNTS"
## [11] "GROANING" "GRUNTING"
## [ ... and 542 more ]
##
## Avengers.Endgame.txt :
## [1] "mm-hmm" "mm-hmm" "kiddo" "hawk-eye" "soup's"
## [6] "tearjerker" "today's" "21" "22" "infection's"
## [11] "meanie" "48"
## [ ... and 252 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "22" "asgard" "asgardian" "thanos"
## [6] "tesseract" "brother's" "tesseract" "asgard" "asgardian"
## [11] "asgardian" "allfathers"
## [ ... and 256 more ]
##
## Avengers.txt :
## [1] "tesseract" "chitauri" "selvig" "tesseract" "selvig"
## [6] "selvig" "evac" "half-hour" "tesseract's" "2"
## [11] "2" "tesseract"
## [ ... and 314 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Stem the filtered tokens in place and print a preview.
# NOTE(review): upper-case tokens are not reduced cleanly (the output shows
# "INDISTINCTLi"); lower-casing before stemming may be what was intended.
marvel_corpus_tokens_alt <- marvel_corpus_tokens_alt %>% tokens_wordstem()
marvel_corpus_tokens_alt
## Tokens consisting of 23 documents.
## Ant-Man.And.The.Wasp.txt :
## [1] "jellybean" "last-minut" "mother" "subatom"
## [5] "burrow" "lase" "micro-treasur" "show-and-tel"
## [9] "world" "karapetyan" "overquot" "karapetyan"
## [ ... and 225 more ]
##
## Ant-Man.txt :
## [1] "pym" "pym" "feroc" "pym" "feroc" "pym"
## [7] "full-siz" "pym" "peachi" "weirdest" "ha-ha" "peachi"
## [ ... and 211 more ]
##
## Avengers.Age.of.Ultron.txt :
## [1] "STRUCKER" "INDISTINCTLi" "GRUNTS" "GRUNTING" "strucker"
## [6] "GRUNTING" "loki" "strucker" "GROANS" "GRUNTS"
## [11] "GROANING" "GRUNTING"
## [ ... and 542 more ]
##
## Avengers.Endgame.txt :
## [1] "mm-hmm" "mm-hmm" "kiddo" "hawk-ey" "soup" "tearjerk"
## [7] "today" "21" "22" "infect" "meani" "48"
## [ ... and 252 more ]
##
## Avengers.Infinity.War.txt :
## [1] "asgardian" "22" "asgard" "asgardian" "thano" "tesseract"
## [7] "brother" "tesseract" "asgard" "asgardian" "asgardian" "allfath"
## [ ... and 256 more ]
##
## Avengers.txt :
## [1] "tesseract" "chitauri" "selvig" "tesseract" "selvig" "selvig"
## [7] "evac" "half-hour" "tesseract" "2" "2" "tesseract"
## [ ... and 314 more ]
##
## [ reached max_ndoc ... 17 more documents ]
# Count bigrams over the tokens left after removing the 50,000-word list;
# frequently co-occurring pairs are typically compound words or names.
marvel_corpus_ngram_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 2))
# List the 50 most frequent bigrams.
textstat_frequency(marvel_corpus_ngram_alt, n = 50)
## feature frequency rank docfreq group
## 1 spider-man_spider-man 36 1 2 all
## 2 thano_thano 26 2 4 all
## 3 asgard_asgard 25 3 5 all
## 4 ultron_ultron 21 4 1 all
## 5 grunting_groans 18 5 3 all
## 6 s.h.i.e.l.d_s.h.i.e.l.d 16 6 5 all
## 7 drax_drax 14 7 2 all
## 8 t'challa_t'challa 14 7 2 all
## 9 grunts_groans 13 9 4 all
## 10 wakanda_wakanda 13 9 3 all
## 11 groans_grunts 12 11 3 all
## 12 groans_groans 12 11 4 all
## 13 grunts_grunting 11 13 3 all
## 14 groans_grunting 10 14 3 all
## 15 heimdal_heimdal 10 14 2 all
## 16 malekith_aether 10 14 1 all
## 17 beeping_beeping 9 17 2 all
## 18 ant-man_ant-man 8 18 4 all
## 19 groaning_grunting 8 18 3 all
## 20 gasps_groans 8 18 4 all
## 21 1_2 8 18 4 all
## 22 vibranium_wakanda 8 18 2 all
## 23 grunts_grunts 8 18 2 all
## 24 pym_pym 7 24 3 all
## 25 romanoff_romanoff 7 24 4 all
## 26 gamora_thano 7 24 3 all
## 27 tesseract_tesseract 7 24 2 all
## 28 klaue_vibranium 7 24 1 all
## 29 hand_hand 7 24 1 all
## 30 dormammu_dormammu 7 24 1 all
## 31 yondu_yondu 7 24 2 all
## 32 indistinct_chattering 7 24 1 all
## 33 sun_sun 7 24 1 all
## 34 chuckles_gasps 6 34 3 all
## 35 gamora_gamora 6 34 2 all
## 36 gasps_grunts 6 34 2 all
## 37 kree_kree 6 34 3 all
## 38 kamar-taj_kamar-taj 6 34 1 all
## 39 sanctum_sanctum 6 34 1 all
## 40 blip_blip 6 34 1 all
## 41 pym_dyne 5 41 1 all
## 42 groans_gasps 5 41 3 all
## 43 2_3 5 41 5 all
## 44 vibranium_vibranium 5 41 2 all
## 45 cannot_cannot 5 41 4 all
## 46 2_2 5 41 3 all
## 47 s.h.i.e.l.d_tesseract 5 41 1 all
## 48 t'challa_wakanda 5 41 1 all
## 49 grunting_grunting 5 41 3 all
## 50 skrull_skrull 5 41 1 all
# Count the most frequent trigrams (n = 3) in the filtered token set;
# frequently co-occurring words are typically compound words.
marvel_corpus_ngram3_alt <- dfm(tokens_ngrams(marvel_corpus_tokens_alt, n = 3))
textstat_frequency(marvel_corpus_ngram3_alt, n = 50)
## feature frequency rank docfreq group
## 1 spider-man_spider-man_spider-man 18 1 2 all
## 2 drax_drax_drax 9 2 2 all
## 3 t'challa_t'challa_t'challa 9 2 1 all
## 4 ultron_ultron_ultron 8 4 1 all
## 5 asgard_asgard_asgard 7 5 1 all
## 6 thano_thano_thano 6 6 3 all
## 7 heimdal_heimdal_heimdal 5 7 2 all
## 8 hand_hand_hand 4 8 1 all
## 9 kamar-taj_kamar-taj_kamar-taj 4 8 1 all
## 10 sanctum_sanctum_sanctum 4 8 1 all
## 11 grunting_groans_grunts 4 8 1 all
## 12 recalibr_recalibr_recalibr 3 12 1 all
## 13 groans_grunting_groans 3 12 2 all
## 14 gamora_thano_thano 3 12 2 all
## 15 grunts_grunts_groans 3 12 2 all
## 16 dormammu_dormammu_dormammu 3 12 1 all
## 17 blip_blip_blip 3 12 1 all
## 18 asgard_surtur_asgard 3 12 1 all
## 19 surtur_asgard_asgard 3 12 1 all
## 20 sun_sun_sun 3 12 1 all
## 21 malekith_aether_malekith 3 12 1 all
## 22 asgard_malekith_aether 3 12 1 all
## 23 yaga_yaga_yaga 2 23 1 all
## 24 pym_feroc_pym 2 23 1 all
## 25 ant-man_ant-man_yellowjacket 2 23 1 all
## 26 groans_grunts_groaning 2 23 2 all
## 27 groans_groans_gasps 2 23 2 all
## 28 rhodey_grunts_groans 2 23 2 all
## 29 grunts_groans_grunting 2 23 2 all
## 30 ultron_romanoff_ultron 2 23 1 all
## 31 wakanda_wakanda_wakanda 2 23 2 all
## 32 ultron_man_ultron 2 23 1 all
## 33 jarvi_ultron_ultron 2 23 1 all
## 34 groans_groans_grunting 2 23 2 all
## 35 romanoff_groans_grunts 2 23 1 all
## 36 grunting_groaning_grunting 2 23 2 all
## 37 gamora_gamora_gamora 2 23 1 all
## 38 tesseract_selvig_selvig 2 23 1 all
## 39 wakandan_vibranium_wakanda 2 23 2 all
## 40 vibranium_vibranium_wakanda 2 23 1 all
## 41 wakanda_t'challa_wakanda 2 23 1 all
## 42 nakia_wakanda_bast 2 23 1 all
## 43 t'challa_wakanda_wakanda 2 23 1 all
## 44 zuri_zuri_zuri 2 23 1 all
## 45 nakia_wakanda_wakanda 2 23 1 all
## 46 shuri_shuri_wakanda 2 23 1 all
## 47 grunts_growls_groaning 2 23 1 all
## 48 binarili_retro-fram_barf 2 23 2 all
## 49 retro-fram_barf_$ 2 23 2 all
## 50 barf_$_611 2 23 2 all
# Top 50 stemmed features of marvel_corpus_tokens_alt
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens_alt)), 50)
## grunt groan asgard thano s.h.i.e.l.d spider-man
## 171 139 122 94 89 86
## ultron cannot wakanda gasp tesseract beep
## 78 62 57 53 50 46
## romanoff yondu pym vibranium chuckl t'challa
## 44 44 43 40 39 38
## gamora kree indistinct heimdal here growl
## 37 37 36 36 35 34
## 10 drax skrull 30 one 2
## 33 33 31 29 26 26
## coulson 1 rhodey klaue selvig �
## 26 24 24 24 24 23
## ant-man aether 20 strucker asgardian xandar
## 23 23 22 22 22 21
## bifrost dormammu malekith world stark jotunheim
## 21 21 21 19 19 19
## sokovia guy
## 18 18
# For comparison: top 50 stemmed features of the original token set
topfeatures(dfm_wordstem(dfm(marvel_corpus_tokens)), 50)
## know go get just right like yeah come can one got
## 1312 1179 1054 1019 988 890 870 834 802 773 758
## now okay gonna oh want need look think hey time back
## 737 707 647 627 609 608 599 586 558 548 532
## see well us take good thing guy man tell thank say
## 521 517 503 495 488 447 445 413 391 381 377
## yes make stark call toni realli way work someth sorri peopl
## 371 365 361 353 349 336 335 329 315 314 313
## kill tri help never littl give
## 300 293 286 281 276 274